We will compare some classifiers on the “Toxic” column.

Load libraries

library(tidyverse)
source("./parameters.R")

Open the Bag of Words with labels

# Load a deliberately small bag-of-words matrix to keep computation time low.
# `col_types_df` is not defined in this chunk (presumably it comes from
# parameters.R; confirm there).
df <- read_csv(
  "bow_words_100_ngrams_1000_rows_balanced_n_2_cut_0.95_tfidf.csv",
  col_types = col_types_df
)
# Drop columns 2, 3 and 5 through 9 by position; column 1 is kept because the
# train/test split below filters on it.
df <- df[, -c(2, 3, 5:9)]
df

Splitting the data

# Column 1 flags the split (1 = train, 2 = test). It is extracted with `[[` so
# the row filter is a plain logical vector; `df[1] == 1` would compare a
# one-column tibble and hand `[` a logical matrix instead. The flag column is
# dropped from both subsets (`-1`).
df_train <- df[df[[1]] == 1, -1]
df_test <- df[df[[1]] == 2, -1]

# Check the label balance of the training split
table(df_train$df_toxic)

    0     1 
13934 14042 
# Label balance of the test split
table(df_test$df_toxic)

   0    1 
5372 5444 
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpXZSB3aWxsIGNvbXBhcmUgc29tZSBjbGFzc2lmaWVycyBvbiB0aGUgIlRveGljIiBjb2x1bW4uDQoNCkxvYWQgbGlicmFyaWVzDQoNCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpzb3VyY2UoIi4vcGFyYW1ldGVycy5SIikNCmBgYA0KDQoNCiMgT3BlbiB0aGUgQmFnIG9mIFdvcmQgd2l0aCBsYWJlbHMNCg0KYGBge3J9DQojIFdlIG9wZW4gYSByZWxhdGl2ZWx5IHNtYWxsIEJhZyBvZiBXb3JkcyBpbiBvcmRlciB0byBsaW1pdCBjYWxjdWxhdGlvbiB0aW1lDQpkZiA9IHJlYWRfY3N2KCJib3dfdGZpZGZfX21pbl93b3Jkc18xMDBfMmdyYW1zXzEwMDBfX3NhbXBsaW5nX2JhbGFuY2VkX19jb3JfY3V0XzAuMV9mcm9tXzE0MDhfdG9fNTQ0X3JtMCIsIGNvbF90eXBlcz1jb2xfdHlwZXNfZGYpDQpkZiA9IGRmWywtYygyLDMsNTo5KV0NCmRmDQpgYGANCg0KDQojIFNwbGl0dGluZyB0aGUgZGF0YQ0KDQpgYGB7cn0NCmRmX3RyYWluID0gZGZbZGZbMV0gPT0gMSwtMV0NCmRmX3Rlc3QgID0gZGZbZGZbMV0gPT0gMiwtMV0NCg0KIyBMZXQncyBjb250cm9sIHRoZSBsYWJlbHMgYmFsYW5jZQ0KdGFibGUoZGZfdHJhaW4kZGZfdG94aWMpDQp0YWJsZShkZl90ZXN0JGRmX3RveGljKQ0KYGBgDQoNCg0KDQo=